You don't need to shift all the counters in every cycle. You can shift them
one by one (as in the B'TMAN engine). What you do need is to output all the values every cycle.

Let's suppose we have
- 6 registers for output values: a', b', d', e', h', l'
- 3 register pairs for frequency: bc, de, sp
- 1 register pair for addition: hl
- 1 register pair for the loop counter: ix

; Sketch of one sub-loop (pseudo-code, not assemblable as written).
; "x /y" and "x/y" list alternatives; lines starting with ':' are t-state
; totals for the group above them.
;
; Part 1: switch to the alternate register set and send every stored level
; to port c, then a final zero. One slot also stores the freshly computed
; level from a into its output register.
exx
out (c),b
ld reg,a /nop(for a')
out (c),d
nop
out (c),e
exa /nop(for a')
out (c),a
exa
out (c),h
nop
out (c),l
xor a
out (c),a ;mute the last output
exx
:+116
; Part 2: advance ONE counter by its frequency, store it back, and turn the
; comparison of its high byte with the duty value into 0 or #18 in a.
; NOTE(review): "1/2*" presumably marks an instruction that is present in
; only some of the sub-loops (see the per-channel budget) -- confirm.
1/2*ld hl,N
add hl,bc/de/sp
ld (),hl
ld a,h ;high byte of the counter goes to a for the duty compare
 cp N  ;ld a,N:cp h for counter-phase
 sbc a,a
 and #18
1/2*ld hl,N ;for the next sub-channel
:49 / 69
1/2*dec lx:jp nz,
:18

How to fit each sub-loop in 184 t-states (116 for the output block + 68 for the counter update):
--- channel 1
116 (write a')
+10 (ld hl,N1) +51 (add hl,bc:ld (N1),hl:ld a,h:cp hy:sbc a,a:and ly) +7 (ld l,N)
--- channel 1 inverted
116 (write b)
4 (ld a,l) +10 (ld hl,N1i) +43 (add hl,bc:ld (N1i),hl:cp h:sbc a,a:and ly) +10 (ld hl,N2) = 67
instead of 68!!!
--- channel 2
116 (write d)
9 (xor a:ret nz) +49 (add hl,de:ld (N2),hl:ld a,h:cp N:sbc a,a:and #18) +10 (ld hl,N2i)
--- channel 2 inverted
116 (write e)
9 (xor a:ret nz) +49 (add hl,de:ld (N2i),hl:ld a,N:cp h:sbc a,a:and #18) +10 (ld hl,N3)
--- channel 3
116 (write h)
9 (xor a:ret nz) +49 (add hl,sp:ld (N3),hl:ld a,h:cp N:sbc a,a:and #18) +10 (ld hl,N3i)
--- channel 3 inverted
116 (write l)
50 (add hl,sp:ld (N3i),hl:ld a,N:cp h:sbc a,a:and ly)
18 (dec lx:jp nz)

so the final code is:

; 3-channel engine with an inverted-phase partner for each channel.
; Register contract on entry:
;bc=freq1
;de=freq2
;sp=freq3
;c'=#fe
;b',d',e',h',l'=0 or #18
;hy=duty1
;ly=#18
; sp is used purely as a 16-bit addend (add hl,sp); nothing is pushed or
; popped inside the loop.
; NOTE(review): presumably interrupts must be disabled while this runs,
; since sp does not point at a stack -- confirm against the caller.
; NOTE(review): the figure below is 6 x 184; with the neg / ld a,r padding
; present in the sub-loops, one pass counts as 6 x 192 = 1152 -- confirm.
        ld ix,tempo+256 ;in 1104 (or 1103) t-units, so hx might be unnecessary
beeploop
; duty3i names the immediate operand of the next instruction.
; NOTE(review): presumably patched by player code that is not shown here.
duty3i equ $+1
        ld a,0
        cp h                    ;h = high byte of the N3i counter left in hl by the end of the previous pass
        sbc a,a                 ;a = #ff when h > duty3i, else 0
        and iyl			;=#18

;--- channel 1
; Output block: emit all six stored levels and then a zero. In this sub-loop
; the fresh level (channel 3 inverted) is already in a, so it is sent
; directly by out (c),a and then parked in a' by the single ex af,af'.
        exx
        out (c),b
        nop			;pad: the new level goes to a', so no register store is needed here
        out (c),d
        nop
        out (c),e
        nop			;pad in place of ex af,af': a already holds the value to send
        out (c),a
        ex af,af'               ;a' = fresh level, a = previous a'
        out (c),h
        nop
        out (c),l
        xor a
        out (c),a		;mute the last output
        exx
; a is 0 here and is reloaded below, so neg only spends time.
; NOTE(review): looks like timing padding added on top of the 68 t-state
; budget described in the notes -- confirm.
	 neg
; N1 names the immediate operand of ld hl: the channel 1 counter is kept
; inside the instruction and written back by ld (N1),hl.
N1 equ $+1
        ld hl,0
        add hl,bc               ;advance by freq1
        ld (N1),hl
        ld a,h
        cp iyh			;=duty1
        sbc a,a                 ;a = #ff when h < duty1, else 0
        and iyl			;=#18
; Preload l with duty1i for the next sub-loop, which reads it with ld a,l.
duty1i equ $+1
        ld l,0

;--- channel 1 inverted
; Output block: emit all six stored levels and then a zero. The level
; computed by the previous sub-loop (channel 1) is stored into b' after b'
; has been sent, so it is first heard on the next sub-loop.
        exx
        out (c),b
        ld b,a                  ;store the fresh channel 1 level
        out (c),d
        nop
        out (c),e
        ex af,af'               ;bring a' in for output
        out (c),a
        ex af,af'               ;and put it back
        out (c),h
        nop
        out (c),l
        xor a
        out (c),a		;mute the last output
        exx
; The value read from r is overwritten by the next instruction, so ld a,r
; only spends time.
	 ld a,r
        ld a,l			;duty1i
; N1i names the immediate operand of ld hl (the inverted channel's counter).
N1i equ $+1
        ld hl,0
        add hl,bc               ;advance by freq1
        ld (N1i),hl
        cp h                    ;a = duty1i: carry when h > duty1i
        sbc a,a                 ;a = #ff when h > duty1i, else 0
        and iyl
; Preload hl with the channel 2 counter for the next sub-loop.
N2 equ $+1
        ld hl,0
; NOTE(review): the remark below predates the ld a,r pad above; with it this
; tail counts as 76 t-states, the same as the other sub-loops -- confirm.
				;67 instead of 68!!!
;--- channel 2
; Output block: emit all six stored levels and then a zero. The level
; computed by the previous sub-loop (channel 1 inverted) is stored into d'
; just before d' is sent.
        exx
        out (c),b
        ld d,a                  ;store the fresh channel 1 inverted level
        out (c),d
        nop
        out (c),e
        ex af,af'               ;bring a' in for output
        out (c),a
        ex af,af'               ;and put it back
        out (c),h
        nop
        out (c),l
        xor a
        out (c),a		;mute the last output
        exx
; a is already 0, so neg and xor a change nothing; xor a leaves Z set, so
; ret nz is never taken. All three are timing padding.
	 neg
        xor a
        ret nz
; hl was preloaded with the channel 2 counter (N2) at the end of the
; previous sub-loop.
        add hl,de               ;advance by freq2
        ld (N2),hl
        ld a,h
; duty2 names the immediate operand of cp.
duty2 equ $+1
        cp 0
        sbc a,a                 ;a = #ff when h < duty2, else 0
        and #18
; Preload hl with the channel 2 inverted counter for the next sub-loop.
N2i equ $+1
        ld hl,0

;--- channel 2 inverted
; Output block: emit all six stored levels and then a zero. The level
; computed by the previous sub-loop (channel 2) is stored into e' before e'
; is sent.
        exx
        out (c),b
        ld e,a                  ;store the fresh channel 2 level
        out (c),d
        nop
        out (c),e
        ex af,af'               ;bring a' in for output
        out (c),a
        ex af,af'               ;and put it back
        out (c),h
        nop
        out (c),l
        xor a
        out (c),a		;mute the last output
        exx
; a is already 0, so neg and xor a change nothing; xor a leaves Z set, so
; ret nz is never taken. All three are timing padding.
	 neg
        xor a
        ret nz
; hl was preloaded with the channel 2 inverted counter (N2i) at the end of
; the previous sub-loop.
        add hl,de               ;advance by freq2
        ld (N2i),hl
; duty2i names the immediate operand of ld a. The operands are swapped
; relative to the non-inverted channel (duty in a, counter in h).
duty2i equ $+1
        ld a,0
        cp h
        sbc a,a                 ;a = #ff when h > duty2i, else 0
        and #18
; Preload hl with the channel 3 counter for the next sub-loop.
N3 equ $+1
        ld hl,0

;--- channel 3
; Output block: emit all six stored levels and then a zero. The level
; computed by the previous sub-loop (channel 2 inverted) is stored into h'
; before h' is sent.
        exx
        out (c),b
        ld h,a                  ;store the fresh channel 2 inverted level
        out (c),d
        nop
        out (c),e
        ex af,af'               ;bring a' in for output
        out (c),a
        ex af,af'               ;and put it back
        out (c),h
        nop
        out (c),l
        xor a
        out (c),a		;mute the last output
        exx
; a is already 0, so neg and xor a change nothing; xor a leaves Z set, so
; ret nz is never taken. All three are timing padding.
	 neg
        xor a
        ret nz
; hl was preloaded with the channel 3 counter (N3) at the end of the
; previous sub-loop; sp serves as the freq3 addend.
        add hl,sp               ;advance by freq3
        ld (N3),hl
        ld a,h
; duty3 names the immediate operand of cp.
duty3 equ $+1
        cp 0
        sbc a,a                 ;a = #ff when h < duty3, else 0
        and #18
; Preload hl with the channel 3 inverted counter for the next sub-loop.
N3i equ $+1
        ld hl,0

;--- channel 3 inverted
; Output block: emit all six stored levels and then a zero. The level
; computed by the previous sub-loop (channel 3) is stored into l' before l'
; is sent.
        exx
        out (c),b
        ld l,a                  ;store the fresh channel 3 level
        out (c),d
        nop
        out (c),e
        ex af,af'               ;bring a' in for output
        out (c),a
        ex af,af'               ;and put it back
        out (c),h
        nop
        out (c),l
        xor a
        out (c),a		;mute the last output
        exx
; a is 0 here, so neg changes nothing; it only spends time.
	 neg
; hl was preloaded with the channel 3 inverted counter (N3i) at the end of
; the previous sub-loop. Only the counter is advanced here; the duty compare
; for this channel is done by the four instructions at beeploop, which read
; h as left by this add.
        add hl,sp               ;advance by freq3
        ld (N3i),hl
        dec ixl                 ;low byte of the tempo counter
        jp nz,beeploop

;--- for constant sound
; Reached when ixl has counted down to 0. It runs one more output block
; before decrementing ixh, so the port keeps being refreshed while the high
; byte of the counter is handled.
; NOTE(review): by instruction count this tail takes as long as a regular
; sub-loop (116 + 76 t-states) -- confirm.
; The ld a,0 below is a literal 0, not the duty3i operand used at beeploop,
; and the resulting a is never stored in an output register.
        ld a,0
        cp h
        sbc a,a
        and iyl			;=#18

        exx
        out (c),b
        nop                     ;pad: nothing is stored in this pass
        out (c),d
        nop
        out (c),e
        ex af,af'               ;bring a' in for output
        out (c),a
        ex af,af'               ;and put it back
        out (c),h
        nop
        out (c),l
        xor a
        out (c),a		;mute the last output
        exx
; a is 0 here, so neg changes nothing; it only spends time.
	 neg
; hl is advanced by sp once more but written to address 0 rather than to
; N3i, so the stored counter is not affected.
; NOTE(review): ld (0),hl looks like a 16 t-state timing dummy (address 0 is
; presumably ROM on the target machine) -- confirm.
        add hl,sp
        ld (0),hl
	dec ixh                 ;high byte of the tempo counter
        jp nz,beeploop 


Note that you can change the duty cycles of the half-channels independently,
and this gives different timbres.
So at the same time you can change:
- volume (phase shift)
- duty cycle
- duty cycle for inverted phase

With the usual method of volume=0 (i.e. with a 1-0 meander),
the duty cycles of the inverted channels must be one less than (not equal to)
the duty cycles of the regular channels.
Otherwise the outputs of channel1 and channel1i (and so on) won't be in exact
counter-phase.

This is because:

        ld a,h ;N1
        cp iyh                  ;=duty1
        sbc a,a ;set if N1<duty1

vs
        ld a,l                  ;duty1i
        cp h ;N1i
        sbc a,a ;set if N1i>duty1i (i.e. N1i>=duty1i+1, not duty1i)





I avoid loops longer than 224 t-states because they produce parasitic tones below 16 kHz.
I tried 4 channels with inverted phases using 8xOUTD, but that routine exceeds 224 t-states.
Also, with 4 channels we have a big loop consisting of 8 sub-loops,
so every channel is updated at only about 2 kHz, which is far too low!

The louder an engine is and the more time there is between
OUTs, the less quantization we hear. That's why I suggest the new
"loud" engine.

Using the method from the 3-channel routine, we can make 2 channels with inverted phases plus 1
regular channel that updates every sub-loop (for the higher pitches).


; Sketch of one sub-loop of the "loud" engine (pseudo-code, not assemblable
; as written; "x/y" lists the per-sub-loop alternatives).
; Each bare number comment is the t-state count of the instructions between
; the previous OUT and the OUT that follows it. The five gaps
; (28+29+28+28+29) plus the five OUTs make up the 200 t-states per sub-loop
; quoted in the notes.
; NOTE(review): ";double" and ";regular" presumably tag instructions that
; serve the phase-pair channel and the regular channel -- confirm.
ld a,dutyN ;/hx for duty1
;28
 out (c),b ;channel1
exx
Nn=$+1
ld hl,0 ;double
add hl,bc/de ;double
ex af,af'
;29
 out (#fe),a ;channel1i
ex af,af'
sub h/add a,h ;double
ld (Nn),hl ;double
exx
;28
 out (c),d ;channel2
; OUT, ld (Nn),hl and exx leave the flags alone, so sbc a,a still sees the
; carry produced by sub h / add a,h before the OUT.
sbc a,a
and ly;#18 ;double
ld reg,a ;/ex af,af'
ld a,hy;duty_regular
sub h ;regular
;28
 out (c),e ;channel2i
sbc a,a
and #18
; The second and #18 does not change a; it only spends time.
and #18
add hl,sp ;regular
;29
 out (#fe),a ;channel3_regular
[dec lx:jr nz ;for the last sub-loop]
[ld a,r:jr $+2 ;for the rest]
;20 for the last sub-loop ;/21 for the rest

For phase shift = 0:
channelN: 1 if N>dutyN
channelNi: 0 if Ni<=255-dutyNi (because it's 1 if Ni+dutyNi>=256, so
it's 0 if Ni+dutyNi<256, i.e. it's 0 if Ni+dutyNi<=255)
The consequences are:
1) The duty for the inverse channel means the same as for the regular channel.
2) But these channels are shifted relative to one another.
For example, for duty=dutyi=#20:
channel1: 0 when H=#00..#20, 1 otherwise (sub h sets carry only when H>duty)
channel1i: 0 when H=#00..#df, 1 otherwise
3) So phase shift = 0 means the maximum volume for any given duty cycles.
4) The minimum volume for equal duty cycles (don't use it, because the sub-channels can't completely cancel one another, so there is noise at double the channel's frequency) is:
phase shift = -duty.
5) The minimum volume for unequal duty cycles is:
phase shift = -smallest_of_duties.

Four sub-loops for one big loop:
ld hl,N1, sub h, write b
ld hl,N1i, add a,h, write a'
ld hl,N2, sub h, write d
ld hl,N2i, add a,h, write e
Total = 200+200+200+200

So the final code is

; Sub-loop 1 of 4: advances counter N1 (pair channel 1, compared with sub h)
; and the regular channel. Five OUTs are issued per sub-loop: b, a', d, e
; and the freshly computed regular-channel level.
loop:
ld a,hx ;duty1
 out (c),b ;channel1
exx
; N1 names the immediate operand of ld hl; the counter lives inside the
; instruction and is written back by ld (N1),hl.
N1=$+1
ld hl,0 ;double
add hl,bc ;double
ex af,af'
 out (#fe),a ;channel1i
ex af,af'
sub h ;double
ld (N1),hl ;double
exx
 out (c),d ;channel2
; ld (N1),hl, exx and OUT leave the flags alone, so this still sees the
; carry from sub h: a = #ff when h > duty1, else 0.
sbc a,a
and ly;#18 ;double
ld b,a
ld a,hy;duty_regular
sub h ;regular
 out (c),e ;channel2i
sbc a,a
and #18
; The second and #18 does not change a; it only spends time.
and #18
add hl,sp ;regular
 out (#fe),a ;channel3_regular
; Tail padding: the value read from r is overwritten at the start of the
; next sub-loop, and jr $+2 just continues with the next instruction.
ld a,r
jr $+2

; Sub-loop 2 of 4: advances counter N1i (pair channel 1 inverted, compared
; with add a,h) and the regular channel.
; duty1i names the immediate operand of ld a.
duty1i=$+1
ld a,0
 out (c),b ;channel1
exx
; N1i names the immediate operand of ld hl (the counter is stored in place).
N1i=$+1
ld hl,0 ;double
add hl,bc ;double
ex af,af'
 out (#fe),a ;channel1i
ex af,af'
add a,h ;double
ld (N1i),hl ;double
exx
 out (c),d ;channel2
; Flags survive the ld/exx/OUT above: a = #ff when duty1i + h >= 256, else 0.
sbc a,a
and ly;#18 ;double
; Park the new channel1i level in a'; the a that comes back is overwritten
; by the next instruction.
ex af,af'
ld a,hy;duty_regular
sub h ;regular
 out (c),e ;channel2i
sbc a,a
and #18
; The second and #18 does not change a; it only spends time.
and #18
add hl,sp ;regular
 out (#fe),a ;channel3_regular
; Tail padding (ld a,r) followed by the jump to sub-loop 3, which is placed
; after the end of the loop.
ld a,r
jr walkaround

; Sub-loop 4 of 4 (entered from walkaround): advances counter N2i (pair
; channel 2 inverted, compared with add a,h) and the regular channel, then
; closes the big loop.
endwalkaround
; duty2i names the immediate operand of ld a.
duty2i=$+1
ld a,0
 out (c),b ;channel1
exx
; N2i names the immediate operand of ld hl (the counter is stored in place).
N2i=$+1
ld hl,0 ;double
add hl,de ;double
ex af,af'
 out (#fe),a ;channel1i
ex af,af'
add a,h ;double
ld (N2i),hl ;double
exx
 out (c),d ;channel2
; Flags survive the ld/exx/OUT above: a = #ff when duty2i + h >= 256, else 0.
sbc a,a
and ly;#18 ;double
ld e,a
ld a,hy;duty_regular
sub h ;regular
 out (c),e ;channel2i
sbc a,a
and #18
; The second and #18 does not change a; it only spends time.
and #18
add hl,sp ;regular
 out (#fe),a ;channel3_regular
; Loop counter: one decrement of lx per four sub-loops.
dec lx
jr nz,loop
; Optional 16-bit tempo extension, left commented out by the author:
;ld a,(timerhigh)
;sub 1
;ld (timerhigh),a
;jr nz,loop ;adds 40 t-states
; Placeholder for the exit jump; the target is not given in these notes.
jp ...

; Sub-loop 3 of 4 (entered from sub-loop 2): advances counter N2 (pair
; channel 2, compared with sub h) and the regular channel, then jumps back
; to endwalkaround for sub-loop 4.
walkaround
; duty2 names the immediate operand of ld a.
duty2=$+1
ld a,0
 out (c),b ;channel1
exx
; N2 names the immediate operand of ld hl (the counter is stored in place).
N2=$+1
ld hl,0 ;double
add hl,de ;double
ex af,af'
 out (#fe),a ;channel1i
ex af,af'
sub h ;double
ld (N2),hl ;double
exx
 out (c),d ;channel2
; Flags survive the ld/exx/OUT above: a = #ff when h > duty2, else 0.
sbc a,a
and ly;#18 ;double
ld d,a
ld a,hy;duty_regular
sub h ;regular
 out (c),e ;channel2i
sbc a,a
and #18
; The second and #18 does not change a; it only spends time.
and #18
add hl,sp ;regular
 out (#fe),a ;channel3_regular
; Tail padding (ld a,r) followed by the jump to sub-loop 4.
ld a,r
jr endwalkaround

